Test models OTB
- In what follows, we test the models against the filtered TESLA peptides. Some of the models must be compiled so are not included, although some models are in the form of simple R/Python scripts, and thus can be executed via this document. We attempt to clarify where this is or isnt possible.
IEDB
Run
TEST_DATA_LOCATION="NEOANTIGEN_TESLA/IEDB_OTB/"
foreach(allele_i=1:length(unique(FullDataset$HLA_Allele)))%do%{
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.txt")
system(paste0("python IEDB_Immunogenicity_Model_Calis/immunogenicity_model/predict_immunogenicity.py ",testdata,
" --allele=",HLA_ALLELE_FOR_TESTING," > ",RESULTS_OUTPUT))
}
Read
- Below reads in the output from executing the model.
TEST_DATA_LOCATION="NEOANTIGEN_TESLA/IEDB_OTB/"
files <- dir(TEST_DATA_LOCATION, pattern = "*_Results")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
IEDB_RESULTS <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Files are output from IEDB per allele and saved with the allele in the file name. Allele is not output in the data, so the below extracts alllele information into a column
IEDB_RESULTS$file=gsub(x=IEDB_RESULTS$file,pattern="_Results.txt",replacement="")
IEDB_RESULTS=IEDB_RESULTS %>% mutate(HLA_Allele = gsub(x=IEDB_RESULTS$file,pattern="Allele_",replacement = ""))
# Map the allele in model output to allele nomenclature in our test dataset
IEDB_RESULTS$HLA_Allele = ClosestMatch2(IEDB_RESULTS$HLA_Allele,unique(FullDataset$HLA_Allele))
# Clean the data table and join it with the original full dataset
IEDB_RESULTS=IEDB_RESULTS %>% dplyr::rename(Peptide=peptide, ImmunogenicityScore=score) %>% select(!file)
IEDB_RESULTS=IEDB_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele")) %>% select(!length) %>% mutate(Dataset = "IEDB")
print(c("IEDB Results has same number of rows as test dataset? : ",IEDB_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IEDB Results has same number of rows as test dataset? : "
## [2] "TRUE"
NetTepi
Run
- The below requires NetTepi to be installed in folder /Applications/netTepi-1.0_orig folder (macOS)
- Seperates peptides by HLA, although for the GBM analysis there is only A0201 bound peptides. Outputs analysis into /NETTEPI_OTB folder.
TEST_DATA_LOCATION="NEOANTIGEN_TESLA/NETTEPI_OTB/"
# For each allele
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*",replacement = "")
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
# Filter the data for the run
data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
# What lengths are used on this run?
lengths = data_for_run %>% mutate(Length=Biostrings::width(Peptide))%>% pull(Length)%>% unique
# Write out data for this run
write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.xls")
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
system(paste0("/Applications/netTepi-1.0_orig/netTepi -a ",HLA_ALLELE_FOR_TESTING," -p ",testdata," -xlsfile ",RESULTS_OUTPUT," -l ",paste0(lengths,collapse = ",")))
system(paste0("mv ",RESULTS_OUTPUT," " ,gsub(x=RESULTS_OUTPUT,pattern=".xls",replacement=""),".csv"))
}
read in output
- Below reads in the output from executing the model.
# Read output files in and combine them
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/NETTEPI_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = "_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
# Get rid of columns we dont need.
NetTepi_Results <- unnest(data3) %>% as.data.table %>% dplyr::select(!c(Pos,Identity,Aff,Stab,Tcell,`%Rank`,file))
# Combined score becomes the 'immunogenicity score'.
NetTepi_Results=NetTepi_Results %>% dplyr::rename(ImmunogenicityScore=Comb,HLA_Allele=Allele)
#Allele formatting as input and output is different, so we map the alleles between NetTepi output and our test dataset by similarity.
NetTepi_Results$HLA_Allele = ClosestMatch2(NetTepi_Results$HLA_Allele,unique(FullDataset$HLA_Allele))
# Below confirms that the above matching works as otherwise the correct number of rows would not be joined by peptide-HLA
NetTepi_Results=NetTepi_Results %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
print(c("NetTepi Results has same number of rows as test dataset? : ",NetTepi_Results %>% nrow == FullDataset %>% nrow))
## [1] "NetTepi Results has same number of rows as test dataset? : "
## [2] "TRUE"
NetTepi_Results=NetTepi_Results %>% mutate(Dataset = 'NETTEPI')
iPred
Run OTB
- The below .R script is taken from Shughay’s repo (https://github.com/antigenomics/ipred)
- Modifications are made only to the last three lines of the .R script, which is done to employ the trained model to predict P(immunogenicity) for our CoV2 peptides of interest.
# Write data to file and run below script to train model OTB and process it
FullDataset %>% select(Peptide) %>% write.table(file="peptides_for_OTB_analysis.txt",quote=F,row.names = F)
source("run_ipred_otb_NEOANTIGEN_TESLA.R")
read
- Below reads in the output from executing the model.
IPRED_RESULTS=fread("NEOANTIGEN_TESLA/IPRED_OTB/IPRED_RESULTS.txt")%>%dplyr::rename(Peptide=antigen.epitope,ImmunogenicityScore=imm.prob)
IPRED_RESULTS=IPRED_RESULTS %>% inner_join(FullDataset)
## Joining, by = "Peptide"
print(c("IPRED Results has same number of rows as test dataset? : ",IPRED_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IPRED Results has same number of rows as test dataset? : "
## [2] "TRUE"
IPRED_RESULTS=IPRED_RESULTS %>% mutate(Dataset = "IPRED")
Repitope
- Output data in a format readable by Repitope
FullDataset %>% dplyr::rename(MHC=HLA_Allele) %>% mutate(Dataset = "TESLA") %>% write.csv(file = "TESLA_TestData_ForRepitope.csv",quote=F,row.names = F)
Compute features, train MHC-I model and make predictions
- The below can take a considerable time (perhaps few hrs) to complete. Feature computation takes a while. I would suggest running each script individually to reduce the chance of issues with memory. I tend to use RScript via the command line.
# Compute the features for our test dataset and write them to FST file
source("compute_repitope_features_NEOANTIGEN_TESLA.R")
# Train and run the model against the GBM dataset
source("run_repitope_OTB_NEOANTIGEN_TESLA.R")
REpitope: Read in
- Repitope takes far too long to run live, so the results are imported.
REPITOPE_RESULTS = fread("NEOANTIGEN_TESLA/REPITOPE_OTB/ALL_PREDICTIONS.csv")
print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "TRUE"
REPITOPE_RESULTS=REPITOPE_RESULTS %>% full_join(FullDataset) %>% select(!"ImmunogenicityScore.cv")
REPITOPE_RESULTS= REPITOPE_RESULTS %>% mutate(Dataset = "REPITOPE")
print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "TRUE"
PRIME
Run
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*||-|HLA",replacement = "")
testdata=paste0("~/Documents/PRIMEDATA_NEOANTIGEN_TESLA/",HLA_ALLELE_FOR_TESTING,".txt")
data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0("~/Documents/PRIMEDATA_NEOANTIGEN_TESLA/",HLA_ALLELE_FOR_TESTING,"Results.txt")
system(paste0("/Applications/PRIME-master/PRIME -i ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -mix /Applications/MixMHCpred-2.1/MixMHCpred"," -o ", RESULTS_OUTPUT))
}
read
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/PRIME_OTB" # Changed for github but should reflect the location of the PRIME output above in 'RESULTS_OUTPUT'
# Find results files,m read in and combine
files <- dir(TEST_DATA_LOCATION, pattern = "Results.txt")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
# Turn into DT and select required columns
PRIME_RESULTS <- unnest(data3) %>% as.data.table %>% select(Peptide,BestAllele,Score_bestAllele)
# Munging the DT col names
PRIME_RESULTS=PRIME_RESULTS %>% dplyr::rename(ImmunogenicityScore=Score_bestAllele,HLA_Allele=BestAllele)
# Map the HLA allele nomenclature
PRIME_RESULTS$HLA_Allele = ClosestMatch2(PRIME_RESULTS$HLA_Allele, unique(FullDataset$HLA_Allele))
# join PRIME output with input test data
PRIME_RESULTS=PRIME_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
# Test same size of DT output from PRIME as input to PRIME
print(c("PRIME_RESULTS Results has same number of rows as test dataset? : ",PRIME_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "PRIME_RESULTS Results has same number of rows as test dataset? : "
## [2] "TRUE"
PRIME_RESULTS=PRIME_RESULTS %>% mutate(Dataset = "PRIME")
NetMHCpan EL
Run netmhcpan 4.0
- Processes data per allele.
- EL output, score on scale 0-1.
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/NETMHCPAN_4_L/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
Read in and process
# Read in and process
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/NETMHCPAN_4_L/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract allele from file name
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
# Map HLA allele nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
# Join with test dataset
Netmhcpanres=Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))%>% inner_join(FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
# Munge the final data table
Netmhcpanres=Netmhcpanres %>% select(Peptide, HLA_Allele,Immunogenicity,Ave) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_EL")
NetMHCpan BA
Run netmhcpan
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/NETMHCPAN_4_BA/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
TEST_DATA_LOCATION = "NEOANTIGEN_TESLA/NETMHCPAN_4_BA/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
NetmhcpanresBA <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
NetmhcpanresBA=NetmhcpanresBA %>% mutate(HLA_Allele = gsub(x=NetmhcpanresBA$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
NetmhcpanresBA$HLA_Allele = ClosestMatch2(NetmhcpanresBA$HLA_Allele,unique(FullDataset$HLA_Allele))
NetmhcpanresBA=NetmhcpanresBA%>% inner_join( FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
NetmhcpanresBA=NetmhcpanresBA %>% select(Peptide, HLA_Allele,Immunogenicity,Ave) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_BA")
DeepImmuno
- Can only process peptides of lengths only 9 and 10. We have pre-filtered for these lengths to compile the test dataset so does not affect the number of peptides here.
- We were unable to compile model locally, so instead we used the webserver and read the results in.
# Output the data for use on the webserver.
FullDataset %>% mutate(Length = width(Peptide)) %>% filter(Length %in% c(9,10)) %>% select(Peptide, HLA_Allele) %>% mutate(HLA_Allele = gsub("\\:","",HLA_Allele)) %>% write.csv(file="OUT_DEEPIMMUNO.csv",quote = F,row.names = F,col.names=F)
## Warning in write.csv(., file = "OUT_DEEPIMMUNO.csv", quote = F, row.names = F, :
## attempt to set 'col.names' ignored
# Read in webserver results
DEEPIMM=fread("NEOANTIGEN_TESLA/DEEPIMMUNO_OTB/result.txt") %>% dplyr::rename(Peptide =peptide, HLA_Allele=HLA,ImmunogenicityScore=immunogenicity)
# Map the HLA nomenclature
DEEPIMM$HLA_Allele = ClosestMatch2(DEEPIMM$HLA_Allele,unique(FullDataset$HLA_Allele))
DEEPIMM=DEEPIMM%>% distinct() %>% inner_join(FullDataset)%>%mutate(Dataset = "DeepImmuno")
## Joining, by = c("Peptide", "HLA_Allele")
print(c("DEEPIMM Results has same number of rows as test dataset? : ",DEEPIMM %>% nrow == FullDataset %>% nrow))
## [1] "DEEPIMM Results has same number of rows as test dataset? : "
## [2] "TRUE"
DeepHLAPan
- Ran on the webserver with default settings.
FullDataset %>% mutate(Annotation="TESLA") %>% select(Annotation,HLA_Allele,Peptide) %>% dplyr::rename(HLA=HLA_Allele,peptide=Peptide)%>% mutate(HLA=gsub("\\*","",HLA)) %>% readr::write_csv(file="DEEPHLAPAN_OUT_TESLA.csv")
DEEPHLAPAN_RESULTS = fread("NEOANTIGEN_TESLA/DEEPHLAPAN_OTB/DEEPHLAPAN_OUT_TESLA_predicted_result.csv")
DEEPHLAPAN_RESULTS=DEEPHLAPAN_RESULTS %>% select(!Annotation) %>% dplyr::rename(HLA_Allele=HLA, ImmunogenicityScore="immunogenic score")
# deephlapan use in neoag prediction, essentially uses a binding threshold of >0.5, (and immgen of that too). Binders are binary, so we first check if any
# are predictedf not to bind. if not, we just take the immgen score
# none observed
DEEPHLAPAN_RESULTS %>% filter("binding score" < 0.5)
## Empty data.table (0 rows and 4 cols): HLA_Allele,Peptide,binding score,ImmunogenicityScore
DEEPHLAPAN_RESULTS$HLA_Allele = ClosestMatch2(DEEPHLAPAN_RESULTS$HLA_Allele,unique(FullDataset$HLA_Allele))
DEEPHLAPAN_RESULTS=DEEPHLAPAN_RESULTS %>% inner_join(FullDataset)%>%mutate(Dataset = "DeepHLApan")%>% select(! "binding score")
## Joining, by = c("HLA_Allele", "Peptide")
GAO predictor
- Gao was executed with default settings.
- Results are read in below
- Example script is provided (NEOANTIGEN_TESLA/GAO_OTB/analyze.R), but will not execute until file paths are corrected.
output
library(seqinr)
#model not coded nicely. need to adapt
# process by HLA
TEST_DATA_LOCATION= "/Users/paulbuckley/Documents/GAO_immunogenicity_predictor_OTB/examples/TESLA_PB/"
FullDataset %>% select(HLA_Allele, Peptide) %>%mutate(HLA_Allele = gsub("\\*","",HLA_Allele))%>% group_by(Peptide) %>% dplyr::summarise(HLA_Allele = unique(paste0(HLA_Allele,collapse = ",")))%>% select(HLA_Allele, Peptide)%>%
readr::write_tsv(file=paste0(TEST_DATA_LOCATION,"peptides_MHC.txt"),col_names = FALSE)
# run analyze.R
read
TEST_DATA_LOCATION= "NEOANTIGEN_TESLA/GAO_OTB/"
files <- dir(TEST_DATA_LOCATION, pattern = ".csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(TEST_DATA_LOCATION, .)))
)
GAO_RESULTS <- unnest(data3) %>% as.data.table
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
FullDataset %>% inner_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA)) %>% nrow
## Joining, by = c("HLA_Allele", "Peptide")
## [1] 399
GAO_RESULTS=FullDataset %>% inner_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA))
## Joining, by = c("HLA_Allele", "Peptide")
GAO_RESULTS=GAO_RESULTS %>% select(!file) %>% dplyr::rename(ImmunogenicityScore=amplitude) %>% select(!immunogenic)%>% mutate(Dataset = "GAO")
Results
Combine all data into DT ‘combinedData’
- Confirm each model (Dataset column) has 399 obs each
# Bind all the model results together
combinedData = rbind(IEDB_RESULTS,NetTepi_Results%>% select(!Epitopes),IPRED_RESULTS,REPITOPE_RESULTS,PRIME_RESULTS, Netmhcpanres %>% mutate(Length = nchar(Peptide)), NetmhcpanresBA %>% mutate(Length = nchar(Peptide)), GAO_RESULTS,DEEPHLAPAN_RESULTS,DEEPIMM)
ALLOWEDLENGTHS = c(9,10)# Does not filter anything in this setting.
combinedData = combinedData%>% mutate(Length = width(Peptide)) %>% filter(Length %in% ALLOWEDLENGTHS)%>% select(!Length)
combinedData %>% select(Dataset) %>% table
## .
## DeepHLApan DeepImmuno GAO IEDB IPRED NETTEPI
## 399 399 399 399 399 399
## PRIME REPITOPE netMHCpan_BA netMHCpan_EL
## 399 399 399 399
combinedData %>% readr::write_csv(file="/Users/paulbuckley/Dropbox/ImmunogenicityBenchmark_V2_EmergingViruses/ImmunogenicityBenchmark_V2_EmergingViruses_BIB_RR/Supp_Files/3_TESLA_MODEL_IMMSCORES.csv")
Produce PR-AUC
# Set factor levels
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels = c("Positive","Negative"))
# Create 'DATA_FOR_PR'. This is for plotting. We modify the model name labels and colour labels to ensure consistency between ROC-AUC and PR-AUC plots
DATA_FOR_PR = combinedData
# Change model labels to ensure consistency between PR-AUC and ROC-AUC
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IEDB',]$Dataset = "IEDB_Model"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IPRED',]$Dataset = "iPred"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'NETTEPI',]$Dataset = "NetTepi"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'REPITOPE',]$Dataset = "REpitope"
#Calculate the praucs
PR_AUC_COMBINED=DATA_FOR_PR %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
PR_AUC_COMBINED$.estimate=round(PR_AUC_COMBINED$.estimate,digits=3)
# Produce and plot PR-AUC curve
pr_AUC=DATA_FOR_PR %>% group_by(Dataset) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO","DeepHLApan"))) %>% pr_curve(Immunogenicity,ImmunogenicityScore) %>% autoplot() + aes(size = Dataset)+scale_size_manual(values=c(1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25)) +annotate(hjust=0,"size"=4,"text",x=.60,y=.775,label=paste0("IEDB_Model: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'IEDB_Model')%>%pull(".estimate"),"\n","iPred: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'iPred')%>%pull(".estimate"),"\n","NetTepi: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'NetTepi')%>%pull(".estimate"),"\n","Repitope: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'REpitope')%>%pull(".estimate"),"\n","PRIME: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'PRIME')%>%pull(".estimate"),"\n","DeepImmuno: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'DeepImmuno')%>%pull(".estimate"),"\n","netMHCpan_EL: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_EL')%>%pull(".estimate"),"\n","netMHCpan_BA: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_BA')%>%pull(".estimate"),"\n","GAO: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'GAO')%>%pull(".estimate"),"\n","DeepHLApan ",PR_AUC_COMBINED %>% filter(Dataset %in% 'DeepHLApan')%>%pull(".estimate") )) + geom_hline(size=1,color="darkgrey",yintercept = nrow(FullDataset[FullDataset$Immunogenicity=='Positive',]) / nrow(FullDataset),linetype="dashed")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=14)+theme(panel.background = element_rect(colour = "black", size=0.5))+ coord_fixed(xlim = 0:1, ylim = 0:1)
pr_AUC

saveRDS(pr_AUC,file="TESLA_PRAUC.rds")
Using ROC-AUC optimal threshold, compute model metrics
- Loop through each model, use youden index to calculate an optimal threshold based on roc curves
- Use optimal threshold to generate a binary prediction.
- Use binary prediction to calculate model metrics e.g., precision, recall, etc.
library(caret)
MODEL_METRICS=foreach(i = 1:length(unique(combinedData$Dataset)), .combine = "rbind")%do% {
MODEL = unique(combinedData$Dataset)[i]
ROC_MODEL=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% MODEL))
threshold = coords(roc=ROC_MODEL, x="best", input="threshold", best.method="youden", transpose=F)$threshold
threshold_data = combinedData %>% filter(Dataset %in% MODEL)%>% mutate(ImmunogenicityPrediction = ifelse(ImmunogenicityScore > threshold, "Positive","Negative"))
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(threshold_data %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(threshold_data %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
TEST=as.matrix(CM, what = "classes")
data.table("Metrics"=row.names(TEST), TEST)%>% pivot_wider(names_from = Metrics, values_from = V1)%>% mutate(Dataset = MODEL)%>% mutate_if(is.numeric, round, digits=3)
}
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
MODEL_METRICS%>% DT::datatable()
Using ROC-AUC optimal threshold, compute and visualise confusion matrices
foreach(i = 1:length(unique(combinedData$Dataset)))%do% {
MODEL = unique(combinedData$Dataset)[i]
ROC_MODEL=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% MODEL))
threshold = coords(roc=ROC_MODEL, x="best", input="threshold", best.method="youden", transpose=F)$threshold
threshold_data = combinedData %>% filter(Dataset %in% MODEL)%>% mutate(ImmunogenicityPrediction = ifelse(ImmunogenicityScore > threshold, "Positive","Negative"))
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(threshold_data %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(threshold_data %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
CMplot=ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle(MODEL)+ font("xy.text",size=18,color="black")+ font("xlab",size=18,color="black")+ font("ylab",size=18,color="black") + theme(plot.title = element_text(size=18))+ font("legend.text",size=12)
#plot(CMplot)
}
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

PR-AUC Bootstrap analysis
- shuffle the immunogenicity labels of the peptides 1000 times
- Compute new PR-AUC score each time given shuffled labels, to generate distribution of random PR-AUC
- Plot distribution and compare to real PR-AUC
- Factor levels are changed to ensure consistency of namings and colours across plots
set.seed(41)
#setup parallel backend to use many processors
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
PR_AUC_RAND.DIST=foreach(i = 1:1000, .combine = rbind,.packages = c("dplyr","magrittr","yardstick","data.table")) %dopar% {
DATA_FOR_PR %>% group_by(Dataset) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO","DeepHLApan"))) %>% mutate(Shuffled_Immunogenicity=sample(size=n(),Immunogenicity)) %>% mutate(Shuffled_Immunogenicity=factor(Shuffled_Immunogenicity,levels = c("Positive","Negative"))) %>%
pr_auc(Shuffled_Immunogenicity,ImmunogenicityScore) %>% mutate(sampleNum=i)
}
stopCluster(cl)
# Real PR-AUCs
PR_AUC_COMBINED=DATA_FOR_PR %>% mutate(Immunogenicity=factor(Immunogenicity,levels = c("Positive","Negative"))) %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO","DeepHLApan"))) %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
# Compare Real and bootstrapped
PR_RAND_DIST_FIG=PR_AUC_RAND.DIST %>%
mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO","DeepHLApan"))) %>% ggdensity(x=".estimate",y="..density..",fill="Dataset",add="mean",color = "Dataset",alpha=0.3)+theme_pubr(base_size = 18) + facet_wrap(~Dataset) + geom_vline(data=PR_AUC_COMBINED,aes(xintercept=.estimate),color="black",linetype="dashed") + xlab("Area under the precision-recall curve") + theme(legend.position = "none")+rotate_x_text(angle=90)
PR_RAND_DIST_FIG

PR_AUC_RAND.DIST %>% group_by(Dataset) %>% dplyr::summarise(Random_mean=mean(.estimate),sd=sd(.estimate)) %>% inner_join(PR_AUC_COMBINED %>% select(!c(.metric,.estimator))) %>% dplyr::rename(Predicted=.estimate) %>% mutate(zscore = round(((Predicted-Random_mean)/sd),2) ) %>% DT::datatable(caption="Mean, sd and zscore to show distance from mean of random distribution")
## Joining, by = "Dataset"